*** This code provides summary stats on the pre-populated return data

* Session setup: close any log left open by an earlier run, empty memory,
* and set the paging, storage-type and line-width options used for this run.
capture log close
clear
set more off
set type double
set linesize 150

* Everything below is recorded in the analysis log (replaced on each run).
log using "$logdir/analysis/summary_stats.log" , replace


* Threshold used below to name and define the correct/over/under indicators.
local thresh 100


****************
* Prepare data *
****************

* Load the 2019 file, keeping only observations with nonfiler equal to 0.
use "$statadir/soi_cdw_taxsim_2019.dta" if nonfiler==0 , clear

* Report the number of observations loaded.
count

* Define the get_bad_dummies program from its do-file, then run it.
* NOTE (from the original author): this calculates the item-based failure
* indicators at the default tolerance of 100 dollars.
quietly do "$adir/get_bad_dummies"
get_bad_dummies

* Drop bad_any first so that the bad_* wildcard below does not include it.
drop bad_any

* Row sum of the remaining bad_* variables (rowtotal treats missing as zero).
egen double bad_total = rowtotal(bad_*)

* 2019 minus the primary filer's year of birth; left missing when the year of
* birth is missing or coded as 0.
generate double age_prim = 2019 - soi_prim_yob if soi_prim_yob!=0 & !missing(soi_prim_yob)

* For the taxsim1_ and taxsim4_ variables, compare the simulated federal income
* tax with soi_fed_incm_tax and classify the gap relative to the thresh local.
foreach xx of numlist 1 4 {
	* Signed gap (simulated minus SOI); missing whenever either input is missing.
	gen double delta_tax`xx'     = taxsim`xx'_fed_incm_tax - soi_fed_incm_tax
	* Positive-only and negative-only copies of the gap (missing otherwise).
	gen double delta_tax`xx'_pos = delta_tax`xx' if delta_tax`xx'>0
	gen double delta_tax`xx'_neg = delta_tax`xx' if delta_tax`xx'<0
	
	* Indicator flags. Stata orders missing values above every number, so the
	* "over" comparison must exclude missing gaps explicitly; without the guard
	* an observation with no gap would be flagged as over the threshold while
	* the "correct" and "under" flags are 0 for it.
	* NOTE(review): all three comparisons are strict, so a gap of exactly
	* plus or minus the threshold is flagged by none of the indicators.
	gen byte d_correct`thresh'_`xx' = (abs(delta_tax`xx')<`thresh')
	gen byte d_over`thresh'_`xx'    = (delta_tax`xx'>`thresh' & !missing(delta_tax`xx'))
	gen byte d_under`thresh'_`xx'   = (delta_tax`xx'<-`thresh')
	
	* Size of the gap among flagged observations; the "under" amount is sign-flipped
	* so that both variables are positive.
	gen double delta_tax_over`thresh'_`xx'  =  delta_tax`xx' if d_over`thresh'_`xx'==1
	gen double delta_tax_under`thresh'_`xx' = -delta_tax`xx' if d_under`thresh'_`xx'==1
}

* 0/1 characteristics used in the summary-statistics table.
* Stata orders missing values above every number, so a bare ">0" or "!=0" test
* evaluates to 1 when the input is missing. The comparisons below therefore
* exclude missing inputs explicitly, so a missing amount is coded 0 rather than 1.
* (inrange() and the "==1" tests already evaluate to 0 for missing inputs.)
gen byte d_married  = inrange(soi_fil_stat,2,3)
gen byte d_deps     = (soi_deps_tot>0 & !missing(soi_deps_tot))
gen byte d_eitc     = (soi_eitc>0 & !missing(soi_eitc))
gen byte d_ctc      = ((soi_ctc_refundable>0 & !missing(soi_ctc_refundable)) | (soi_ctc_nonrefundable>0 & !missing(soi_ctc_nonrefundable)))
gen byte d_itemizer = (soi_fded==1)
gen byte d_sched_c  = (soi_sched_c_netincm!=0 & !missing(soi_sched_c_netincm))
gen byte d_paidprep = (soi_prep==1)
* Exactly one of the bad_* indicators summed into bad_total is set.
gen byte d_one_bad  = (bad_total==1)



**********************
* Stats in the paper *
**********************

** what is the total tax liability on pre-populated returns, among filers with pre-populated returns?
* Weighted sums of the simulated and SOI federal income tax, plus the weighted
* count, over observations with a non-missing taxsim1_fed_incm_tax.
preserve
	drop if missing(taxsim1_fed_incm_tax)
	generate freq = 1
	collapse (sum) taxsim1_fed_incm_tax soi_fed_incm_tax freq [iweight=soi_wgt] , fast
	* Rescale for display: tax totals divided by 1e9, weighted count by 1e6.
	foreach v of varlist taxsim1_fed_incm_tax soi_fed_incm_tax {
		replace `v' = round(`v'/1e9)
	}
	replace freq = round(freq/1e6)
	list
restore

** what is the total tax liability on pre-populated returns, among filers with pre-populated returns, separately by top-5%-income vs not?
* Same weighted totals as the unsplit version, reported separately for d_top5 = 0 and 1.
preserve
	drop if missing(taxsim1_fed_incm_tax)
	generate freq = 1
	* NOTE(review): agi_bin==20 is taken to be the top-5% income bin, per the
	* question above; confirm against how agi_bin is constructed.
	generate byte d_top5 = (agi_bin==20)
	collapse (sum) taxsim1_fed_incm_tax soi_fed_incm_tax freq [iweight=soi_wgt] , by(d_top5) fast
	* Rescale for display: tax totals divided by 1e9, weighted count by 1e6.
	foreach v of varlist taxsim1_fed_incm_tax soi_fed_incm_tax {
		replace `v' = round(`v'/1e9)
	}
	replace freq = round(freq/1e6)
	list
restore

** what are total withholding and total SOI tax liability, among filers without pre-populated returns?
* Weighted sums of cdw_withholding and soi_fed_incm_tax, plus the weighted count,
* over observations whose taxsim1_fed_incm_tax is missing.
preserve
	drop if !missing(taxsim1_fed_incm_tax)
	generate freq = 1
	collapse (sum) cdw_withholding soi_fed_incm_tax freq [iweight=soi_wgt] , fast
	* Rescale for display: dollar totals divided by 1e9, weighted count by 1e6.
	foreach v of varlist cdw_withholding soi_fed_incm_tax {
		replace `v' = round(`v'/1e9)
	}
	replace freq = round(freq/1e6)
	list
restore

** what if taxpayers made only favorable changes? how much is attributable to the top 10%? top 1%?
* Weighted sum of the positive tax gaps (delta_tax1_pos) and the shares of that
* sum coming from observations above the 90th and 99th percentiles of soi_agi.
preserve
	* Weighted AGI cutoffs. They are computed here, before the sample is
	* restricted, so they are percentiles of every observation in memory.
	* aweights are used because the weight types documented for _pctile are
	* aweight, fweight and pweight; percentiles do not depend on the weight scale.
	_pctile soi_agi [aw = soi_wgt] , p(90, 99)
	* Keep the cutoffs in temporary scalars so they are compared at full double
	* precision instead of passing through a macro's string representation.
	tempname p90 p99
	scalar `p90' = r(r1)
	scalar `p99' = r(r2)
	* Positive gap for observations above each cutoff, 0 for those below.
	* Missing soi_agi is excluded explicitly because Stata orders missing above
	* every number, which would otherwise place it in the top groups.
	gen delta_tax1_pos_top10 = (soi_agi>`p90' & !missing(soi_agi))*delta_tax1_pos
	gen delta_tax1_pos_top1  = (soi_agi>`p99' & !missing(soi_agi))*delta_tax1_pos
	keep if !missing(taxsim1_fed_incm_tax)
	* The wildcard sums delta_tax1_pos together with the two top-group versions.
	collapse (sum) delta_tax1_pos*  [iw=soi_wgt] , fast
	gen frac_top10 = delta_tax1_pos_top10/delta_tax1_pos
	gen frac_top1  = delta_tax1_pos_top1/delta_tax1_pos
	list
restore

** what if taxpayers made only unfavorable changes?
* Weighted sum of the negative tax gaps (delta_tax1_neg) over observations
* with a non-missing taxsim1_fed_incm_tax.
preserve
	drop if missing(taxsim1_fed_incm_tax)
	collapse (sum) delta_tax1_neg [iweight=soi_wgt] , fast
	list
restore



************************
* Table: summary stats *
************************

* Weighted summary statistics for the 0/1 characteristics.
summarize d_married d_deps d_paidprep d_eitc d_ctc d_itemizer d_sched_c d_one_bad [aweight=soi_wgt]
* Weighted detailed summaries (including percentiles) of the continuous variables.
summarize age_prim soi_deps_tot soi_agi soi_txbl_incm soi_fed_incm_tax [aweight=soi_wgt] , detail


* Close the analysis log; capture keeps this from failing if no log is open.
capture log close
